<meta charset="utf-8">
<title>GEO data & harmony</title>
<style>
/*
v0.1 可用
*/
pre{
	word-wrap: break-word!important; 
	white-space: pre-wrap;
	padding: 0 10px;
}

</style>


<pre>
# Aim: load data from GEO, use Harmony instead PC to do follow steps

setwd("/home/wangjl/data/project/outsource/shenwh/")
getwd()

outputRoot=""

library(Seurat)
library(ggplot2)
library(dplyr)
library(patchwork)

obj=read.table("GSE181919_UMI_counts.txt")
dim(obj) #20000 54239
obj[1:4, 1:4]


colnames(obj)=sub("\\.", "-", colnames(obj) )
obj[1:4, 1:4]


meta =read.table("GSE181919_Barcode_metadata.txt")
dim(meta) #54239     8
meta[1:4, ]

scObj <- CreateSeuratObject(counts = obj, project = "HNSCC", 
                            min.cells = 3, min.features = 200,
                            meta.data = meta)
scObj
#rm(obj)
gc()


head( rownames( scObj@meta.data) )
head( colnames(scObj@assays$RNA@counts) )
head( scObj@meta.data)


dim(scObj@assays$RNA@data) #20000 54239
dim(scObj@assays$RNA@data)



scObj[["percent.mt"]] <- PercentageFeatureSet(scObj, pattern = "^MT-")
VlnPlot(scObj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, pt.size=0)


plot1 <- FeatureScatter(scObj, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(scObj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2


#DimPlot(scObj)

scObj <- NormalizeData(scObj, normalization.method = "LogNormalize", scale.factor = 10000)
scObj <- FindVariableFeatures(scObj, selection.method = "vst", nfeatures = 2000)
all.genes <- rownames(scObj)
scObj <- ScaleData(scObj, features = all.genes)
scObj=RunPCA(scObj)



# Harmony ----
# install.packages("harmony")
library( harmony )

# 代替PC，用于后续
scObj = RunHarmony(scObj, "sample.id", plot_convergence = TRUE)
scObj@reductions$harmony[1:10, 1:5]

#细胞的嵌入
harmony_embeddings <- Embeddings(scObj, 'harmony')
harmony_embeddings[1:5, 1:5]



# PCA(use harmony instead) ----
scObj <- RunPCA(scObj, features = VariableFeatures(object = scObj))
DimPlot(scObj, reduction = "pca")

ElbowPlot(scObj, ndims = 50)





# umap ----
scObj <- RunUMAP(scObj, dims = 1:50, reduction = "harmony")


DimPlot(scObj, reduction = "umap", label = T, group.by = "patient.id")


# Cell cluster ----
scObj <- FindNeighbors(scObj, dims = 1:50, reduction = "harmony")
scObj <- FindClusters(scObj, resolution = 0.2)

DimPlot(scObj, reduction = "umap", label = T)

DimPlot(scObj, reduction = "umap", label = T, group.by = "cell.type")

table(scObj$sample.id)
table(scObj$patient.id)
table(scObj$subsite)

table(scObj$seurat_clusters, scObj$cell.type)

head(colnames(scObj))
head(rownames(meta))
table( colnames(scObj) == rownames(meta) )

table(meta$cell.type)


# marker ----

## manual ----
markers=list(
  malignant=c('KRT14', 'KRT17', 'KRT6A', 'KRT5', 'KRT19', 'KRT8', 'KRT16', 'KRT18', 'KRT6B', 'KRT15', 'KRT6C', 'KRTCAP3', 'EPCAM', 'SFN'), 
  fibroblasts=c('FAP', 'PDPN', 'COL1A2', 'DCN', 'COL3A1', 'COL6A1'), 
  myocytes=c('ACTA1', 'ACTN2', 'MYL2', 'MYH2'), 
  T =c('CD2', 'CD3D', 'CD3E', 'CD3G', 'CD4', 'CD8A', 'CD8B', 'FOXP3'),
  NK=c('NKG7', 'GNLY', 'GZMA', 'GZMB'),
  B_Plasma = c( 'CD79A', 'CD79B', 'CD19', 'MS4A1', 'SDC1',
               'SLAMF7','BLNK', 'FCRL5'), 
  mono=c('CD14', "FCGR3A"),
  macrophages=c('CD163', 'CD68', 'FCGR2A', 'CSF1R'), 
  dendritic=c('ITGAX','CD40', 'CD80', 'CD83', 'CCR7'), 
  mast = c('CMA1', 'MS4A2', 'TPSAB1', 'TPSB2'),
  endothelial =c('PECAM1', 'VWF', 'ENG'),
  cycle=c('MKI67', 'CCND1', 'CCND3', 'CCNE1', #'CCNE2', 
          'CCNB1', 'CCNB2'),
  'APA' =c("SCCPDH", 'NUDT21', "CPSF6"),
  stem=c('CD34','SOCS2', 'SOCS1'),
  meta=c("GAPDH", 'ACTB')
)
DotPlot(scObj, features = markers, cluster.idents = T) + RotatedAxis()
DotPlot(scObj, features = markers, group.by = "cell.type") + RotatedAxis()

DotPlot(scObj, features = unique(c(
  'NUDT21', 'SCCPDH', 'GRB10', "UBFD1", "IFT20", "NUTF2", "CD59", "SFT2D1", 
  "BLOC1S5",  "MAP2", "CDKN2B", "KRAS",  
  'CD34', 'SOCS2',
  c('CD2', 'CD3D', 'CD3E', 'CD3G', 'CD4', 'CD8A', 'CD8B'), #T
  'KRT19', 'KRT8', 'KRT16', 'KRT18', #epithelial
  
  grep("^CPSF", rownames(scObj), value=T),
  grep("^CSTF", rownames(scObj), value=T),
  grep("^GZM", rownames(scObj), value=T),
  grep("^CCN", rownames(scObj), value=T),
  #grep("^SOCS", rownames(scObj), value=T),
  #grep("^IGH", rownames(scObj), value=T),
"GAPDH")), cluster.idents = T) + RotatedAxis()

FeaturePlot(scObj, features = c("NUDT21", 'CPSF6', 'SCCPDH', 'MKI67', 'CD34', 'SOCS2'),
            cols = c('lightgrey', 'red'), pt.size = 0.1, ncol=3) & NoLegend()

table(scObj$cell.type)


tmp=FindMarkers(scObj, ident.1 = "Malignant.cells", ident.2 = "Epithelial.cells",
            group.by = "cell.type")

DotPlot(scObj, features = unique(
  tmp %>% filter(p_val_adj<0.05) %>% top_n(n = 20, wt = avg_log2FC) %>%
    rownames()
), cluster.idents = T) + RotatedAxis()

VlnPlot(scObj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), 
        ncol = 1, pt.size=0)

DotPlot(scObj, features = c("CD79A", 
                                grep("^IG", rownames(scObj), value=T)))+RotatedAxis()
FeaturePlot(scObj, features = c("CD79A",  "IGHG4", "IGLC7"          ))+RotatedAxis()


DotPlot(scObj, features =c("CD34", 'NUDT21', 'SCCPDH', 'CCNB1'), 
        
        split.by = "tissue.type") + RotatedAxis()

scObj$tissue.type=factor(scObj$tissue.type, 
                         levels = c("NL", "LP",  'CA',    'LN'))
DotPlot(scObj, features = unique(c(
  "CD34", 'NUDT21', 'SCCPDH', 'CCNB1',
  "MKI67", "CCNB2", "CCNE1",
  unlist(markers)
)), cluster.idents = F, 
split.by = "tissue.type", cols = c("red", 'orange', "green", 'blue')
)+RotatedAxis() + ggtitle("among 4 tissue types") + ggplot2::coord_flip()
ggsave("a1.scObj-by-tissue.sample.Seurat.dotplot.pdf", width=16, height=15)


## auto marker (30min) ----




### (2c) T cell markers ----
scObj.markers.T <- FindMarkers(scObj, ident.1 = "T.cells",group.by = "cell.type",
                               only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
write.table(scObj.markers.T, "a1.scObj-by-cluster.T-marker.txt")
DotPlot(scObj, features = unique(c(
  c("PTPRC", "CD3D", "CD3E", "CD3G", "CD4", "CD8A", "CD8B", 'FOXP3', "CD274", "GAPDH"),
  scObj.markers.T |>  filter(p_val_adj<0.05) %>% top_n(n = 800, wt = avg_log2FC) |> 
    rownames() |> head(100)
)), cols = c("lightgrey", "red"), cluster.idents = T ) + RotatedAxis()+
  ggtitle("T markers among all cells") + coord_flip()
ggsave("a1.scObj-by-cluster.T-marker.Seurat.dotplot.pdf", width=7.5, height=20)

DoHeatmap(scObj, features = scObj.markers.T |>  filter(p_val_adj<0.05) |> rownames() )




# find markers for every cluster compared to all remaining cells, report only the positive ones
scObj.markers <- FindAllMarkers(scObj, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

scObj.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_log2FC)


 c("SCCPDH", "MSL1",
  "CD34","KIT", "CD38", "SOCS2","GATA1",
  "MKI67","CCNE1", "CCNE2",
  "NUDT21", "CPSF6",
  "GAPDH",
  
  "PTPRC", "CD3D", "CD3E", "CD4", 'CD8A', 'CD8B',
  'CD79A', 'CD79B', 'IGHG4',
  'NKG7', 'GNLY', 
  "CD14", "FCGR3A",
  'C1QB')
DotPlot(scObj, features = unique(c( 
  #sapply(c("Fcer1a", "Itgb7", "Fcer1g", "Cpa3","Mcpt8", "Csf1"),toupper),
  #"EPCAM", 'KRT8', 'KRT18', 'KRT19', 'KRT5', 'KRT15', 
  #"CD3D", "CD3E", "CD4", "CD8A", "CD8B", "SELL",
  #"NOS2","IRF5","PTGS2", "CD163", "VSIG4", "MS4A4A", "ARG2",
  scObj.markers %>% group_by(cluster) %>%
    filter(cluster %in% c(15) ) %>%  #by cluster
    top_n(n = 70, wt = avg_log2FC) %>% #arrange( desc(pct.1) ) %>%
    pull("gene") %>% unique() %>% jsonlite::toJSON()
)), cluster.idents = T, cols=c('lightgrey', "red")) + RotatedAxis()+
  ggtitle("Myocyte")
#ggsave("a1.scObj.Seurat.dotplot.pdf", width=30, height=5.5)

grep("^PABP", rownames(scObj), value=T)
grep("^PAPOL", rownames(scObj), value=T)
DotPlot(scObj, features = unique(c(
  #"EPB41", "EPB41L2", "EPB41L3",
  #"CD274", 'LAG3',
  "SCCPDH",
  "NUDT21", 'CPSF6','CSTF2', 'FIP1L1', 'PAPOLA',
  'XBP1', 'GPX1', 'FLT3', 'PIM1', 'MYC', 'HOXA9', 'BMI1',
  #"CD79A", "CD79B", "MS4A1", "CD19",   "VPREB3",
  #"NOS2","IRF5","PTGS2", "CD163", "VSIG4", "MS4A4A",
  "FLT3", "CD34",'RUNX1', 'RUNX1T1'
  #sapply(
  #  #c("Bcl2","Rgs1","Xcl1","Dusp2","Sh2d2a","S100a10","Cd2","Ugcg","Lck","Txk","Vps37b","Gimap4","Gimap5","Klrb1a","Klrb1c","Klre1","Klrd1","Klrk1","Klrc2","Klri2","Klra4","Klra8","Klra9","Klra7","Klra3","Ncr1","Nkg7","Ifngr1","Prf1","Ifng","Gzmb","Eomes","Ccl5","Ccl3","Ccl4","Cd7","Serpinb6b","Serpinb9","Ctla2a","Gzma","Id2","Il2rb","Lgals1","Sytl3","H2-K1","H2-Q7","Ctsw","Ahnak","AW112010","Ms4a4b"),
  #  c("Fn1","Dbi","Vim","Zeb2","Cst3","Mafb","Ctsz","Psma7","Anxa5","S100a4","Ctss","Klf4","Prdx1","Sdc3","Plac8","Rassf4","Clec4a1","Clec4a3","Apoe","Ctsc","Ifitm3","Smpdl3a","Psap","Lyz2","Lrp1","Lamp1","Hpgd","Lpl","Hmox1","Lgals3","Ctsb","Trf","Ccr2","Cd68","Ccl9","Gngt2","Grn","F13a1","Npc2","Lgmn","Pld4","Crip1","Selenop","Lgals1","Aif1","Csf1r","Ahnak","Ms4a4c","Ms4a6c","Mpeg1"),
  #  toupper)
  #intersect(t1,t2),
  #setdiff(t1, t2), 
  #setdiff(t2, t1)
)), cluster.idents = T, cols=c('lightgrey', "red")) + RotatedAxis()#+
  ggtitle("mono")

FeaturePlot(scObj, features = c(
  "NOS2","IRF5","PTGS2", "CD163", "VSIG4", "MS4A4A"
), ncol=3, cols=c('lightgrey', "red")) & NoLegend()


### (100a)apa factors ----
apa_factors=list(
  CPSF=c("CPSF1","CPSF2","CPSF3", "CPSF3L", "CPSF4","CPSF4L","CPSF6","CPSF7"),
  CSTF=c("CSTF1","CSTF2","CSTF2T","CSTF3"),
  PABP=c("PABPC1","PABPC1L","PABPC3","PABPC4","PABPC4L","PABPC5","PABPN1"),
  PAPOL=c("PAPOLG", "PAPOLA"),
  other=c("SYMPK", "RBBP6", "WDR33", "FIP1L1", "NUDT21","CLP1", "PCF11", "SCCPDH")     #无法归类
)




### gene mean exp of core factors in each cluster ----
apa_exp_data = AverageExpression(scObj, features = unlist(apa_factors), slot="data", 
                                 group.by = "seurat_clusters")
apa_exp_data=apa_exp_data$RNA
apa_exp_data[1:4,1:4]
# rm all 0 rows
apa_exp_data=apa_exp_data[rowSums(apa_exp_data)>0,]
dim(apa_exp_data)

library(pheatmap)
pic1 = pheatmap(log1p(apa_exp_data),
         border_color = NA,
         scale = "row",
         clustering_method = 'ward.D2',
         #filename = "a10_apa_factors_HNSCC.heatmap.pdf", width = 5, height = 4.8,
         main="APA core factors in HNSCC")
#


## save Or load ----
# saveRDS(scObj, file = "data/scObj.Seurat.rds")
# scObj =readRDS(file = "data/scObj.Seurat.rds")
write.table(scObj.markers, "data/scObj.markers.txt")
# scObj.markers=read.table("data/scObj.markers.txt")

table(scObj$seurat_clusters)
scObj_down=subset(scObj, downsample=2000)
table(scObj_down$seurat_clusters)
DoHeatmap(scObj_down, 
          #features = unlist(apa_factors), 
          features = pic1$tree_row$labels[pic1$tree_row$order], 
          group.by = "seurat_clusters" )+
  ggtitle("version:1146")
#




#1. Macrophage(cluster4): normal -> leukoplakia -> primary-> metastasis ----
## (1) get macrophage cell
sc_sub = subset(scObj, idents=4)
sc_sub = subset(sc_sub, UMAP_1<5 & UMAP_2<5 & UMAP_2>-10)
sc_sub 
DimPlot(sc_sub, label = T)

table(sc_sub$tissue.type)
# CA   LN   LP   NL 
# 3361  659  514  694 
sc_sub$tissue.type=factor(sc_sub$tissue.type, levels = c("NL", "LP", 'CA', 'LN'))
DimPlot(sc_sub, label = F, split.by = "tissue.type")


## (2) compare between group
Idents(sc_sub)="tissue.type"
markers.tissue = FindAllMarkers(sc_sub, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

top10 <- markers.tissue %>% group_by(cluster) %>% top_n(n = 30, wt = avg_log2FC)
#DoHeatmap(sc_sub, features = unique(top10$gene), label = FALSE) +  #+ NoLegend()
#  scale_fill_gradientn(colors = c("blue", "white", "red"))+ggtitle("Macrophage")

DotPlot(sc_sub, features = unique(top10$gene), cols = c('lightgrey', 'red')) + 
  RotatedAxis() +  ggtitle("Macrophage cells")
write.table(markers.tissue, "a1.scObj-by-tissue.Marcrophage-DEG.txt")



#2. Epithelial(cluster3): normal -> leukoplakia -> primary-> metastasis ----
## (1) get macrophage cell
sc_sub3 = subset(scObj, idents=3)
sc_sub3 = subset(sc_sub3, UMAP_1<5 & UMAP_2< (-7) )
sc_sub3 
DimPlot(sc_sub3, label = T)

#table(sc_sub$tissue.type)
# CA   LN   LP   NL 
# 3366  659  514  694 
# 3361  659  514  694 after trim
sc_sub3$tissue.type=factor(sc_sub3$tissue.type, levels = c("NL", "LP", 'CA', 'LN'))
DimPlot(sc_sub3, label = F, split.by = "tissue.type")


## (2) compare between group
Idents(sc_sub3)="tissue.type"
markers.tissue3 = FindAllMarkers(sc_sub3, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

top10_3 <- markers.tissue3 %>% group_by(cluster) %>% top_n(n = 20, wt = avg_log2FC)
DoHeatmap(sc_sub3, features = unique(top10_3$gene), label = FALSE) +  #+ NoLegend()
  scale_fill_gradientn(colors = c("blue", "white", "red"))+ggtitle("Epithelial")





#3. fibro (cluster0) ----
## (1) get macrophage cell
sc_sub3 = subset(scObj, idents=3)
sc_sub3 = subset(sc_sub3, UMAP_1<5 & UMAP_2< (-7) )
sc_sub3 
DimPlot(sc_sub3, label = T)

#table(sc_sub3$tissue.type)
# CA   LN   LP   NL 
# 100  162 5204  245 
sc_sub3$tissue.type=factor(sc_sub3$tissue.type, levels = c("NL", "LP", 'CA', 'LN'))
DimPlot(sc_sub3, label = F, split.by = "tissue.type")


## (2) compare between group
Idents(sc_sub3)="tissue.type"
markers.tissue3 = FindAllMarkers(sc_sub3, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

top10_3 <- markers.tissue3 %>% group_by(cluster) %>% top_n(n = 20, wt = avg_log2FC)
DoHeatmap(sc_sub3, features = unique(top10_3$gene), label = FALSE) +  #+ NoLegend()
  scale_fill_gradientn(colors = c("blue", "white", "red"))+ggtitle("Epithelial")

DotPlot(sc_sub3, features = unique(top10_3$gene)) + RotatedAxis()+
  ggtitle("Epithelial cells")




#4. T cells(1,2,8) ----
FeaturePlot(scObj, features = c('PTPRC',"CD3D", "CD3E", 'CD3G', 'CD4', 'CD8A', 'CD8B', 
                                'GZMA', 'GZMB', 'GZMK', 'GZMH', "CD274"),
            cols = c('lightgrey', 'red'))&NoLegend()


## (1) get the cell
sc_sub4 = subset(scObj, idents=c(1,2,8) )
sc_sub4 = subset(sc_sub4, UMAP_1< (-2) & UMAP_2< (6) & UMAP_2> (-8) )
sc_sub4
DimPlot(sc_sub4, label = T)

table(sc_sub4$tissue.type)
# CA   LN   LP   NL 
#4138 1398 7601 4661 
sc_sub4$tissue.type=factor(sc_sub4$tissue.type, levels = c("NL", "LP", 'CA', 'LN'))
DimPlot(sc_sub4, label = F, split.by = "tissue.type")


## (2) compare between group
Idents(sc_sub4)="tissue.type"
markers.tissue4 = FindAllMarkers(sc_sub4, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

top10_4 <- markers.tissue4 %>% group_by(cluster) %>% top_n(n = 20, wt = avg_log2FC)
DoHeatmap(sc_sub4, features = unique(top10_4$gene), label = FALSE) +  #+ NoLegend()
  scale_fill_gradientn(colors = c("blue", "white", "red"))+ggtitle("T cells")
ggsave( paste0(outputRoot, "04_heatmap.tissule.T.pdf"), width=4, height=9.5, useDingbats = F)

DotPlot(sc_sub4, features = unique(top10_4$gene)) + RotatedAxis()+
  ggtitle("T cells")

# barplot
tbl_dat=table(sc_sub4$tissue.type, sc_sub4$seurat_clusters);tbl_dat
tbl_dat=tbl_dat[, colSums(tbl_dat)>0]
colors = c("blue", "orange", "red", "pink")
barplot(tbl_dat, col =colors , main="Cell number in each cluster")
legend("topright", legend = rownames(tbl_dat), fill =colors, border = NA,bty="n" )
#
barplot(apply(tbl_dat, 2, function(x){x/sum(x)}), col =colors , main="Cell number in each cluster")
#legend("topright", legend = rownames(tbl_dat), fill =colors, border = NA,bty="n" )



## (1c). DEG along cell.type ----
markers.tissue4 %>% filter(p_val_adj<0.05) %>% dim() #591   7

tmp1=markers.tissue4 %>% filter(p_val_adj<0.05) %>% 
  group_by(cluster) %>% top_n(n = 200, wt = avg_log2FC)

tmp1 |> dim()
DotPlot(sc_sub4, features = unique(tmp1$gene)[1:30] ) + RotatedAxis()+
  ggtitle("T cells")
# mean expression level
scObj_T.marksers = as.data.frame(AverageExpression(sc_sub4, group.by = "tissue.type" )$RNA)
scObj_T.marksers=scObj_T.marksers[order(-scObj_T.marksers$LN),]


head(scObj_T.marksers)

scObj_T.increase = (function(){
  result=c()
  len = nrow(scObj_T.marksers)
  for(i in 1:len){
    line = scObj_T.marksers[i, , drop=F]
    if(line[1,1]<=line[1,2] & line[1,2]<=line[1,3] & line[1,3]<=line[1,4]){
      result =c(result, rownames(line))
    }
  }
  print(length(result))
  result
})()
# 
#scObj_T.increase |> jsonlite::toJSON() #2734
# ["HES4","ISG15","FAM132A","UBE2J2","PUSL1","CPTP","AURKAIP1","MRPL20","SSU72"] 

# overlap
scObj.markers_ol = intersect(scObj_T.increase, tmp1$gene)
length(scObj.markers_ol) #118
length(unique(scObj.markers_ol)) #118
scObj.markers_ol |> jsonlite::toJSON()
#
DotPlot(sc_sub4, features = scObj.markers_ol[1:60], cols = c("lightgrey", "red") ) + RotatedAxis()+
  ggtitle("T cells: 1-60 increase genes")

DotPlot(sc_sub4, features = scObj.markers_ol[61:118], cols = c("lightgrey", "red") ) + RotatedAxis()+
  ggtitle("T cells: 61-118 increase genes")



#5. CD8+ T cells(2,8) ----
FeaturePlot(scObj, features = c('PTPRC',"CD3D", "CD3E", 'CD3G', 'CD4', 'CD8A', 'CD8B', 
                                'GZMA', 'GZMB', 'GZMK', 'GZMH', "CD274"),
            cols = c('lightgrey', 'red'))&NoLegend()


## (1) get the cell
sc_sub5 = subset(scObj, idents=c(2,8) )
sc_sub5 = subset(sc_sub5, UMAP_1< (-2) & UMAP_2< (6) & UMAP_2> (-7) )
sc_sub5
DimPlot(sc_sub5, label = T)

#table(sc_sub5$tissue.type)
# CA   LN   LP   NL 
#2153  616 4201 2707
sc_sub5$tissue.type=factor(sc_sub5$tissue.type, levels = c("NL", "LP", 'CA', 'LN'))
DimPlot(sc_sub5, label = F, split.by = "tissue.type")


## (2) compare between group
Idents(sc_sub5)="tissue.type"
markers.tissue5 = FindAllMarkers(sc_sub5, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

top10_5 <- markers.tissue5 %>% group_by(cluster) %>% top_n(n = 20, wt = avg_log2FC)
DoHeatmap(sc_sub5, features = unique(top10_4$gene), label = FALSE) +  #+ NoLegend()
  scale_fill_gradientn(colors = c("blue", "white", "red"))+ggtitle("CD8+ T cells")
#ggsave( paste0(outputRoot, "04_heatmap.tissule.T.pdf"), width=4, height=9.5, useDingbats = F)

DotPlot(sc_sub5, features = unique(top10_5$gene)) + RotatedAxis()+
  ggtitle("CD8+T cells")

# barplot
tbl_dat=table(sc_sub5$tissue.type, sc_sub5$seurat_clusters);tbl_dat
tbl_dat=tbl_dat[, colSums(tbl_dat)>0]
colors = c("blue", "orange", "red", "pink")
barplot(tbl_dat, col =colors , main="Cell number in each cluster")
legend("topright", legend = rownames(tbl_dat), fill =colors, border = NA,bty="n" )
#
barplot(apply(tbl_dat, 2, function(x){x/sum(x)}), col =colors , main="Cell number in each cluster")
#legend("topright", legend = rownames(tbl_dat), fill =colors, border = NA,bty="n" )






# 6. markers between T cells ----
## (1) get the cell
sc_sub6 = subset(scObj, idents=c(1, 2, 8) )
sc_sub6 = subset(sc_sub6, UMAP_1< (-2) & UMAP_2< (6) & UMAP_2> (-7) )
sc_sub6
DimPlot(sc_sub6, label = T)

#table(sc_sub6$tissue.type)
# CA   LN   LP   NL 
#4138 1398 7601 4661
sc_sub6$tissue.type=factor(sc_sub6$tissue.type, levels = c("NL", "LP", 'CA', 'LN'))
DimPlot(sc_sub6, label = F, split.by = "tissue.type")


## (2) compare between group
#Idents(sc_sub6)="tissue.type"
markers.tissue6 = FindAllMarkers(sc_sub6, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

top10_6 <- markers.tissue6 %>% group_by(cluster) %>% top_n(n = 20, wt = avg_log2FC)

# all T cells
DotPlot(sc_sub6, features = unique(c(
  "CD3D", "CD3E", "CD3G", "CD4", "CD8A", "CD8B", "FOXP3", 
  top10_6$gene)
)) + RotatedAxis()+ ggtitle("T cells")


DotPlot(sc_sub6, features = unique(c(
  "CD3D", "CD3E", "CD3G", "CD4", "CD8A", "CD8B", "FOXP3", 
  top10_6$gene)
), split.by = "tissue.type", 
cols = c("red", "pink", "blue", "orange" )) + 
  RotatedAxis()+ ggtitle("T cells, split.by=tissue.type")



# all cells
DotPlot(scObj, features = unique(c(
  "CD3D", "CD3E", "CD3G", "CD4", "CD8A", "CD8B", "FOXP3", 
  top10_6$gene)
), cluster.idents = T) + RotatedAxis()+ ggtitle("All cells")



## all cell barplot ----
tbl_dat=table(scObj$cell.type, scObj$seurat_clusters);tbl_dat
tbl_dat=tbl_dat[, colSums(tbl_dat)>0]
colors =1:10 #c("blue", "orange", "red", "pink")
barplot(tbl_dat, col =colors , main="Cell number in each cluster")
legend("topright", legend = rownames(tbl_dat), fill =colors, border = NA,bty="n" )
#
barplot(apply(tbl_dat, 2, function(x){x/sum(x)}), col =colors , main="Cell number in each cluster")
#legend("topright", legend = rownames(tbl_dat), fill =colors, border = NA,bty="n" )








# 7. correlation & APA ----

dat = FetchData(scObj, vars = c(
  #"FOXP3", 'CD274'
  "SCCPDH", "NUDT21"
  ) )
dat=dat[rowSums(dat)>0,]
head(dat)
sum(dat)

dat[1:10, ]
cor.test(dat[,1], dat[,2])

grep("CD300", rownames(scObj), value=T)
DotPlot(scObj, features = c("SCCPDH",
                            "CD3D", "CD3E", "CD4","CD8A","CD8B", "NKG7", "GZMA", "GZMB", "CD79A", 
                           
                            "CD34", "SOCS2",
                            "CD274", "LAG3", 'ICOS', "CTLA4", "TIGIT",
                            #"FIP1L1", "CSTF2", "CPSF3", 'CPSF2', 'CSTF3', 
                            #'NUDT21', 'CPSF6',
                            "CSF1R","CD68", grep("CD300", rownames(scObj), value=T),
                            #"MRPL45",
                            "CCNE1",'CCNE2', 'CCNB1', "MKI67",
                            "PCF11"
                            #'GAPDH'
                            ),
        #split.by = "tissue.type", cols = c("red", 'orange', "green", 'blue'),
        #group.by = "cell.type",
        #group.by = "tissue.type",
        group.by = "seurat_clusters",
        cluster.idents = F)+RotatedAxis() + coord_flip()




### (2)Re plot DotPlot ----

# named color
color.head=c('#617381', "#76AB9B", '#CE5A5B', "#EA7C5B");
names(color.head) =c("NL", "LP", "CA", "LN")
color.head
# draw with Seurat
g1=DotPlot(scObj, features = c("CD300LF"), group.by = "tissue.type", 
        cluster.idents = F)+labs(x="", y="") + NoLegend(); g1 #+ #+RotatedAxis() #+ coord_flip()
# >> get data from ggplot obj
head(g1$data)
#            avg.exp  pct.exp features.plot id avg.exp.scaled
#CD300LF  0.04571820 4.370236       CD300LF CA      1.0291080
#CD300LF1 0.03949094 4.144320       CD300LF LN      0.6632240
# re-plot with ggplot2
ggplot(g1$data, aes(x=features.plot, y=id, size=pct.exp, color=id))+
  geom_point(show.legend = F)+
  scale_color_manual(values=color.head)+
  theme_classic(base_size = 14)+
  labs(x="", y="")
#
  

VlnPlot(scObj, features = c("CD34", "SCCPDH", "LAG3", 
                            'NUDT21', 'CPSF6',"MKI67", 
                            #"CCNE1",'CCNE2', 
                            'CCNB1',  'SOCS2'),
        ncol = 2, pt.size = 0)

FeaturePlot(scObj, features = c("CD34", "SCCPDH", "NUDT21", "CPSF6", "MKI67", "CCNB1"), 
            cols = c("lightgrey", 'red'), pt.size = 0.1) & NoLegend()

sc_cluster8=subset(scObj, idents=8)
sc_cluster8=subset(sc_cluster8, UMAP_1<(-2) & UMAP_2 <5 & UMAP_2> (-7))
DimPlot(sc_cluster8)
VlnPlot(sc_cluster8, features = c("SCCPDH", 'LAG3', 'NUDT21', 'MKI67'), 
        group.by = "tissue.type", pt.size = 0, ncol = 1)



### (3)Re plot half VlnPlot ----
gene.symbol="PCF11"
g2=VlnPlot(scObj, 
           features = c(gene.symbol),
           #features = c("CD68"), 
        #group.by="tissue.type", 
        group.by="cell.type", 
        #cols = color.head,
        pt.size=0)+RotatedAxis(); g2 #fig1
#  scale_color_manual(color.head)
head(g2$data)
#                           PCF11 ident
#AAACGGGCATGACGGA-1  1.370958e-05         T.cells
#AAAGATGAGCAGACTG-1 -5.646982e-06         T.cells


# BiocManager::install("gghalves")
# options(BioC_mirror="https://mirrors.tuna.tsinghua.edu.cn/bioconductor") 
# options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
library(gghalves)
dat=data.frame(
  ident=g2$data$ident,
  value=g2$data[,1]
)
# rm 0 values?
dim(dat)
dat=dat[which(dat$value>1e-4),]
dim(dat)
ggplot(dat, aes(x = ident, y = value, fill = ident) )+
  geom_jitter(size=0.1, color="grey80") +
  geom_violin(trim=F, #末尾是否截断
              aes(fill=ident), #填充颜色
              scale="width", #默认是按面积 area，推荐改为width
              bw=0.1)+ #取样宽度，越小越精细模拟原始值
  geom_boxplot(width=0.1, fill="white", show.legend = F, 
               outlier.colour = "#00000000") + #离群值 透明
  theme_classic(base_size = 14)+
  RotatedAxis()+
  ggtitle(gene.symbol) #fig2
#

ggplot(dat, aes(x = ident, y = value, fill = ident) ) +
  geom_half_point(size=0.1, side="l", aes(color=ident) )+
  geom_half_violin(
    aes(fill = ident),
    alpha=0.7,
    #position = position_nudge(x = .15, y = 0),
    #adjust=0.5,
    trim=F, 
    colour=NA, #描边颜色
    bw=0.1,
    side = 'r',
    show.legend = F
  )+
  geom_boxplot(width=0.1, fill="white", show.legend = F, 
               outlier.colour = "#00000000") + #离群值 透明
  #scale_fill_manual(values = color.head) +
  #scale_color_manual(values = color.head) +
  theme_classic(base_size = 14)+
  RotatedAxis()+
  labs(x="", y="log(Normalized counts +1)", title = bquote( italic(.(gene.symbol)) ) )+
  guides(colour = guide_legend(override.aes = list(alpha = 1, size=3))) #fig3
#
#
FeaturePlot(scObj, features = c("PCF11", "CD300LF", "CD274", "NUDT21", "CPSF6", "CPSF3"), 
            ncol = 2,
            cols =c("lightgrey", "red"), order=T)
</pre>
